notebook.community

Edit and run



In [1]:

    
import words
reload(words)









    Out[1]:





<module 'words' from 'words.pyc'>



In [67]:

    
# Forwarded as `cache=` to every samples.load_samples call in this cell.
CACHE = False

import samples
reload(samples)

# Pooled sample set built from several sources at once.
data = samples.load_samples(["Keywords", "UK", "Georgia", "Mexico", "EU"], cache=CACHE)

# Single-source sample sets, loaded one source at a time in this order.
keywords, canada, moldova, unops = (
    samples.load_samples([source], cache=CACHE)
    for source in ("Keywords", "Canada", "Moldova", "UNOPS")
)

# Distinct entity labels that occur in the keyword samples.
entities = list({sample['entity'] for sample in keywords})

entities









    Out[67]:





['notice',
 'good',
 'solicitation',
 'contract',
 'supplier',
 'authority',
 'buyer',
 '?']



In [3]:

    
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import cross_validation



In [58]:

    
# Group the positions of the rows in `data` by the value of their 'sample' key.
slices = {}
for i, row in enumerate(data):
    if row['sample'] not in slices:
        slices[row['sample']] = []
    slices[row['sample']].append(i)

I. Define Raw Data



In [5]:

    
def organize_data(data):
    """Flatten a mapping of entity -> headers into one record per header.

    data: dict whose values are iterables of headers.
    Returns a list of ``{'entity': key, 'header': header}`` dicts, in the
    iteration order of ``data.items()`` and then of each header iterable.
    """
    return [
        {'entity': entity, 'header': header}
        for entity, headers in data.items()
        for header in headers
    ]

II. Define Features



In [11]:

    
def length(df):
    """Feature: ``len()`` of every value in the 'header' column of ``df``."""
    header_column = df['header']
    return header_column.apply(len)

def word_count(df):
    """Feature: how many items ``words.split_words`` yields for each header."""
    def count_pieces(header):
        # Count the yielded items without materialising them into a list.
        return sum(1 for _ in words.split_words(header))

    return df['header'].apply(count_pieces)

def header_in_entity(df):
    """Feature: does the lower-cased header equal any lower-cased entity value?

    The test is membership in the whole 'entity' column of ``df`` (``isin``),
    not a comparison with the entity on the same row and not a substring check.
    """
    lowered_headers = df['header'].str.lower()
    lowered_entities = df['entity'].str.lower()
    return lowered_headers.isin(lowered_entities)

def entity_in_header(df):
    """Feature: does the lower-cased entity equal any lower-cased header value?

    The test is membership in the whole 'header' column of ``df`` (``isin``),
    not a comparison with the header on the same row and not a substring check.
    """
    lowered_entities = df['entity'].str.lower()
    lowered_headers = df['header'].str.lower()
    return lowered_entities.isin(lowered_headers)

def entity_feature(name):
    """Build a feature function that scores headers against one entity's known headers.

    name: entity label. The headers of every row in the module-level ``data``
        whose 'entity' equals ``name`` are collected once, when this factory
        is called.

    Returns a function taking a DataFrame and returning, for each value of its
    'header' column, ``words.subsetness(header, known_headers)``. The returned
    function is named ``'entity_<name>'``; Model.build uses that name as the
    feature-matrix column name.
    """
    entity_set = set(x['header'] for x in data if x['entity'] == name)

    def score(header):
        try:
            return words.subsetness(header, entity_set)
        except Exception:
            # Best effort: a header that cannot be scored counts as "no overlap".
            # Only ordinary errors are absorbed; KeyboardInterrupt and SystemExit
            # still propagate so a long-running cell can be interrupted.
            return 0

    def feature(df):
        return df['header'].apply(score)

    # __name__ and the Python 2 alias func_name are the same attribute.
    feature.__name__ = 'entity_%s' % name
    return feature

# One subsetness feature per known entity label, in the order of `entities`.
entity_features = []
for name in entities:
    entity_features.append(entity_feature(name))



In [11]:

III. Combine Features into Feature Matrix & Define Outcome

IV. Create Model

V. Split Data into Test and Training

Fit and Test Models



In [85]:

    
from sklearn.ensemble import RandomForestClassifier

class Model(object):
    """Train a classifier on labelled header samples and report its accuracy.

    samples: list of dicts; each needs a 'header' key and the ``outcome_key``
        label (plus whatever columns the chosen feature functions read).
    outcome_key: name of the column holding the label to predict.
    svm: estimator exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. When omitted, a fresh
        ``RandomForestClassifier(n_estimators=10)`` is created for this
        instance. The attribute keeps the name ``svm`` for compatibility.

    Feature functions take a DataFrame and return one value per row; their
    ``__name__`` becomes the column name in the feature matrix.
    """

    def __init__(self, samples, outcome_key='entity', svm=None):
        if svm is None:
            # Built per instance: an estimator placed in the signature is created
            # once at class-definition time and then shared by every Model.
            svm = RandomForestClassifier(n_estimators=10)
        self.samples = samples
        self.svm = svm
        self.frame = pd.DataFrame(self.samples)
        self.outcome_key = outcome_key
        self.features_built = set()

    def test(self, features, iterations=5, train_size=0.35, test_size=.25, seed=0):
        """Print the mean accuracy over ``iterations`` random train/test splits.

        train_size, test_size and seed are handed to ShuffleSplit unchanged.
        """
        X, y = self._training_matrix(features)

        rs = cross_validation.ShuffleSplit(len(X), n_iter=iterations, train_size=train_size, test_size=test_size, random_state=seed)

        accuracies = []
        for train_index, test_index in rs:
            # ShuffleSplit yields row positions, so index positionally.
            model = self.svm.fit(X.iloc[train_index], y.iloc[train_index])
            actual = y.iloc[test_index].values
            predicted = model.predict(X.iloc[test_index])
            accuracies.append(self.score_model(actual, predicted))

        print("Avg Accuracy: %%%.2f" % np.mean(accuracies))

    def test_sample(self, slice, features):
        """Fit on every row, then print predictions for the rows at positions ``slice``.

        The rows in ``slice`` are part of the training data, so the printed
        accuracy is an in-sample score, not a held-out one.
        """
        X, y = self._training_matrix(features)
        model = self.svm.fit(X, y)

        actual = y.iloc[slice].values
        predicted = model.predict(X.iloc[slice])
        headers = [self.samples[i]['header'] for i in slice]
        self._report(headers, actual, predicted)

    def test_data(self, data, features):
        """Fit on every row, then print predictions for the separate labelled ``data``.

        data: list of dicts shaped like ``samples``, including ``outcome_key``.
        """
        X, y = self._training_matrix(features)
        model = self.svm.fit(X, y)

        df = pd.DataFrame(data)
        z = self.build(df, features)

        # Score against the configured label column, not a hard-coded 'entity'.
        actual = df[self.outcome_key]
        predicted = model.predict(z)
        self._report([dct['header'] for dct in data], actual, predicted)

    def score_model(self, actual, predicted):
        """Return the percentage (0-100) of positions where actual == predicted.

        Returns 0.0 when there is nothing to score.
        """
        pairs = list(zip(actual, predicted))
        if not pairs:
            return 0.0
        correct = sum(1 for a, p in pairs if a == p)
        return float(correct) / len(pairs) * 100

    def predict(self, headers, features):
        """Fit on every row and return a list of ``(header, predicted_label)`` pairs.

        The frame built from ``headers`` and its feature matrix are kept on
        ``self.df`` and ``self.z`` so they can be inspected afterwards.
        """
        X, y = self._training_matrix(features)
        model = self.svm.fit(X, y)

        # Feature functions read an 'entity' column, so fill in a placeholder label.
        data = [{'header': h, 'entity': '?'} for h in headers]
        df = pd.DataFrame(data)
        z = self.build(df, features)

        self.df = df
        self.z = z

        predictions = model.predict(z)
        return list(zip(headers, predictions))

    def build(self, df, features):
        """Return a feature matrix with one column per function in ``features``."""
        result = pd.DataFrame()
        for fn in features:
            # __name__ is the same attribute as the Python 2 alias func_name.
            result[fn.__name__] = fn(df)
        return result

    def _training_matrix(self, features):
        """Return ``(X, y)`` built from every row of ``self.frame``."""
        return self.build(self.frame, features), self.frame[self.outcome_key]

    def _report(self, headers, actual, predicted):
        """Print one 'header actual predicted' line per row, then the accuracy."""
        accuracy = self.score_model(actual, predicted)
        for header, a, p in zip(headers, actual, predicted):
            print("%s %s %s" % (header.ljust(50), a.ljust(20), p))
        print("Accuracy: %%%.2f" % accuracy)

# Model over the pooled `data` samples, using the default outcome key and estimator.
model = Model(samples=data)



In [13]:

    
# Baseline: only the length and word_count features.
model.test([length, word_count])









    



Avg Accuracy: %37.95



In [14]:

    
# Baseline features plus the two header/entity membership features.
model.test([length, word_count, header_in_entity, entity_in_header])









    



Avg Accuracy: %49.09



In [86]:

    
# Every hand-written feature plus one subsetness feature per entity.
model.test([length, word_count, header_in_entity, entity_in_header] + entity_features)









    



Avg Accuracy: %79.77



In [16]:

    
# The per-entity subsetness features on their own.
model.test(entity_features)









    



Avg Accuracy: %78.64

Predict a dataset



In [65]:

    
# Predictions for the rows of `data` whose 'sample' is "Canada".
# Model.test_sample fits on every row first, so these rows are also training rows.
# NOTE(review): the loader cell shown in this notebook does not list "Canada" among
# the sources of `data`; confirm slices has a 'Canada' key before running.
model.test_sample(
    slice=slices['Canada'],
    features=[length, word_count, header_in_entity, entity_in_header] + entity_features,
)









    



language                                           ?                    ?
procurement_entity_name                            authority            authority
title                                              good                 good
reference_number                                   notice               notice
solicitation_number                                solicitation         solicitation
contract_sequence_number                           contract             contract
contract                                           contract             notice
publishing_status                                  solicitation         solicitation
award_date                                         contract             contract
publication_date                                   notice               notice
amendment_date                                     notice               notice
gsin                                               good                 good
contract_award_procedure                           solicitation         solicitation
tendering_procedure                                solicitation         solicitation
procurement_entity                                 authority            authority
end_user_entity                                    buyer                buyer
customer_info                                      buyer                buyer
description                                        good                 good
supplier_info                                      supplier             buyer
currency                                           solicitation         solicitation
currency                                           notice               contract
currency                                           contract             contract
contract_value                                     solicitation         solicitation
contract_value                                     notice               contract
contract_value                                     contract             contract
Accuracy: %84.00



In [87]:

    
# Fit on the model's own samples, then score the separately loaded Canada samples.
model.test_data(
    data=canada,
    features=[length, word_count, header_in_entity, entity_in_header] + entity_features,
)









    



language                                           ?                    ?
procurement_entity_name                            authority            buyer
title                                              good                 good
reference_number                                   notice               ?
solicitation_number                                solicitation         ?
contract_sequence_number                           contract             authority
contract_number                                    contract             solicitation
publishing_status                                  solicitation         solicitation
award_date                                         contract             solicitation
publication_date                                   notice               solicitation
amendment_date                                     notice               solicitation
gsin                                               good                 good
contract_award_procedure                           solicitation         ?
tendering_procedure                                solicitation         solicitation
procurement_entity                                 authority            ?
end_user_entity                                    buyer                solicitation
customer_info                                      buyer                solicitation
description                                        good                 good
supplier_info                                      supplier             solicitation
currency                                           solicitation         solicitation
currency                                           notice               solicitation
currency                                           contract             solicitation
contract_value                                     solicitation         solicitation
contract_value                                     notice               solicitation
contract_value                                     contract             solicitation
Accuracy: %32.00



In [37]:

    
# model.df and model.z are only set by Model.predict, so a predict() call on
# `model` must have run before this cell.
model.df.join(model.z, how='left')









    Out[37]:






  
    
      
      entity
      header
      length
      word_count
      header_in_entity
      entity_in_header
      entity_notice
      entity_good
      entity_solicitation
      entity_contract
      entity_supplier
      entity_authority
      entity_buyer
      entity_?
    
  
  
    
      0 
       ?
                       language
        8
        6
       False
       False
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       1.000000
    
    
      1 
       ?
        procurement_entity_name
       23
       17
       False
       False
       0.882353
       0.882353
       0.882353
       0.882353
       0.882353
       0.882353
       0.941176
       0.882353
    
    
      2 
       ?
                          title
        5
       12
       False
       False
       0.000000
       1.000000
       0.000000
       1.000000
       0.000000
       0.000000
       0.000000
       1.000000
    
    
      3 
       ?
               reference_number
       16
       28
       False
       False
       0.035714
       0.607143
       1.000000
       0.071429
       0.000000
       0.000000
       0.000000
       1.000000
    
    
      4 
       ?
            solicitation_number
       19
       20
       False
       False
       0.050000
       0.850000
       0.850000
       0.050000
       0.000000
       0.000000
       0.000000
       0.900000
    
    
      5 
       ?
       contract_sequence_number
       24
       36
       False
       False
       0.361111
       0.805556
       0.805556
       0.361111
       0.000000
       0.000000
       0.000000
       0.805556
    
    
      6 
       ?
                contract_number
       15
       29
       False
       False
       0.448276
       1.000000
       1.000000
       0.448276
       0.000000
       0.000000
       0.000000
       1.000000
    
    
      7 
       ?
              publishing_status
       17
        6
       False
       False
       0.333333
       0.000000
       1.000000
       0.500000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      8 
       ?
                     award_date
       10
       18
       False
       False
       1.000000
       0.277778
       1.000000
       1.000000
       0.277778
       0.277778
       0.000000
       0.277778
    
    
      9 
       ?
               publication_date
       16
       17
       False
       False
       0.764706
       0.000000
       1.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      10
       ?
                 amendment_date
       14
       15
       False
       False
       0.866667
       0.000000
       0.866667
       0.866667
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      11
       ?
                           gsin
        4
       16
       False
       False
       0.000000
       1.000000
       0.437500
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      12
       ?
       contract_award_procedure
       24
       21
       False
       False
       0.809524
       0.809524
       1.000000
       0.809524
       0.238095
       0.285714
       0.000000
       1.000000
    
    
      13
       ?
            tendering_procedure
       19
        8
       False
       False
       0.500000
       0.000000
       1.000000
       0.500000
       0.125000
       0.125000
       0.000000
       0.625000
    
    
      14
       ?
             procurement_entity
       18
        2
       False
       False
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.500000
       0.000000
    
    
      15
       ?
                end_user_entity
       15
       22
       False
       False
       0.000000
       0.000000
       0.818182
       0.818182
       0.000000
       0.000000
       0.045455
       0.000000
    
    
      16
       ?
                  customer_info
       13
        2
       False
       False
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.500000
    
    
      17
       ?
                    description
       11
        3
       False
       False
       0.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      18
       ?
                  supplier_info
       13
        2
       False
       False
       0.000000
       0.000000
       0.000000
       0.000000
       0.500000
       0.000000
       0.000000
       0.500000
    
    
      19
       ?
                       currency
        8
        3
       False
       False
       0.000000
       0.000000
       0.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      20
       ?
                       currency
        8
        3
       False
       False
       0.000000
       0.000000
       0.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      21
       ?
                       currency
        8
        3
       False
       False
       0.000000
       0.000000
       0.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.000000
    
    
      22
       ?
                 contract_value
       14
       23
       False
       False
       1.000000
       0.521739
       1.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.521739
    
    
      23
       ?
                 contract_value
       14
       23
       False
       False
       1.000000
       0.521739
       1.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.521739
    
    
      24
       ?
                 contract_value
       14
       23
       False
       False
       1.000000
       0.521739
       1.000000
       1.000000
       0.000000
       0.000000
       0.000000
       0.521739
    
  

25 rows × 14 columns



In [19]:

    
# Rebind `model` to one trained on the keyword samples only.
model = Model(keywords)

# `headers` was not defined in any cell of this notebook, so this cell raised
# NameError on a fresh kernel. The predictions printed under this cell list the
# Canada headers in order, so rebuild the list from `canada`.
headers = [sample['header'] for sample in canada]
results = model.predict(headers, features = [length, word_count, header_in_entity, entity_in_header] + entity_features)

for header, result in results:
    print("%s %s" % (header.ljust(50), result))









    



language                                           supplier
procurement_entity_name                            ?
title                                              supplier
reference_number                                   solicitation
solicitation_number                                solicitation
contract_sequence_number                           solicitation
contract_number                                    solicitation
publishing_status                                  solicitation
award_date                                         solicitation
publication_date                                   ?
amendment_date                                     ?
gsin                                               good
contract_award_procedure                           ?
tendering_procedure                                solicitation
procurement_entity                                 solicitation
end_user_entity                                    solicitation
customer_info                                      solicitation
description                                        buyer
supplier_info                                      solicitation
currency                                           buyer
currency                                           buyer
currency                                           buyer
contract_value                                     ?
contract_value                                     ?
contract_value                                     ?



In [ ]:

    
# Header strings of the UNOPS samples, in load order.
[entry['header'] for entry in unops]

	entity	header	length	word_count	header_in_entity	entity_in_header	entity_notice	entity_good	entity_solicitation	entity_contract	entity_supplier	entity_authority	entity_buyer	entity_?
0	?	language	8	6	False	False	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
1	?	procurement_entity_name	23	17	False	False	0.882353	0.882353	0.882353	0.882353	0.882353	0.882353	0.941176	0.882353
2	?	title	5	12	False	False	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000
3	?	reference_number	16	28	False	False	0.035714	0.607143	1.000000	0.071429	0.000000	0.000000	0.000000	1.000000
4	?	solicitation_number	19	20	False	False	0.050000	0.850000	0.850000	0.050000	0.000000	0.000000	0.000000	0.900000
5	?	contract_sequence_number	24	36	False	False	0.361111	0.805556	0.805556	0.361111	0.000000	0.000000	0.000000	0.805556
6	?	contract_number	15	29	False	False	0.448276	1.000000	1.000000	0.448276	0.000000	0.000000	0.000000	1.000000
7	?	publishing_status	17	6	False	False	0.333333	0.000000	1.000000	0.500000	0.000000	0.000000	0.000000	0.000000
8	?	award_date	10	18	False	False	1.000000	0.277778	1.000000	1.000000	0.277778	0.277778	0.000000	0.277778
9	?	publication_date	16	17	False	False	0.764706	0.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000
10	?	amendment_date	14	15	False	False	0.866667	0.000000	0.866667	0.866667	0.000000	0.000000	0.000000	0.000000
11	?	gsin	4	16	False	False	0.000000	1.000000	0.437500	0.000000	0.000000	0.000000	0.000000	0.000000
12	?	contract_award_procedure	24	21	False	False	0.809524	0.809524	1.000000	0.809524	0.238095	0.285714	0.000000	1.000000
13	?	tendering_procedure	19	8	False	False	0.500000	0.000000	1.000000	0.500000	0.125000	0.125000	0.000000	0.625000
14	?	procurement_entity	18	2	False	False	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000	0.000000
15	?	end_user_entity	15	22	False	False	0.000000	0.000000	0.818182	0.818182	0.000000	0.000000	0.045455	0.000000
16	?	customer_info	13	2	False	False	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000
17	?	description	11	3	False	False	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
18	?	supplier_info	13	2	False	False	0.000000	0.000000	0.000000	0.000000	0.500000	0.000000	0.000000	0.500000
19	?	currency	8	3	False	False	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000
20	?	currency	8	3	False	False	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000
21	?	currency	8	3	False	False	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000
22	?	contract_value	14	23	False	False	1.000000	0.521739	1.000000	1.000000	0.000000	0.000000	0.000000	0.521739
23	?	contract_value	14	23	False	False	1.000000	0.521739	1.000000	1.000000	0.000000	0.000000	0.000000	0.521739
24	?	contract_value	14	23	False	False	1.000000	0.521739	1.000000	1.000000	0.000000	0.000000	0.000000	0.521739